引言

这里直接读取作者给定的第2个病人的 Gene expression analysis: validation patient,用的是 10x Genomics 5’ V(D)J 平台测序.

是 25066 genes across 11071 samples.

载入必要的R包

需要自行下载安装一些必要的R包! 而且需要注意版本 Seurat

因为大量学员在中国大陆,通常不建议大家使用下面的R包安装方法,建议是切换镜像后再下载R包。

参考:http://www.bio-info-trainee.com/3727.html

# 下面代码不运行。
# Enter commands in R (or R studio, if installed)
# Install the devtools package from Hadley Wickham
install.packages('devtools')
# Replace '2.3.0' with your desired version
devtools::install_version(package = 'Seurat', version = package_version('2.3.0'))

library(Seurat)

加载R包

rm(list = ls()) # clear the environment
#load all the necessary libraries
options(warn=-1) # turn off warning message globally
suppressMessages(library(Seurat))

读入文章关于第2个病人的全部表达矩阵

start_time <- Sys.time()
# 如果觉得这里较慢,可以使用 data.table 包的 fread函数。
raw_data <- read.csv('../Output_2018-03-12/GSE118056_raw.expMatrix.csv.gz', header = TRUE, row.names = 1)
end_time <- Sys.time()
end_time - start_time
## Time difference of 1.314043 mins
# 通常电脑一分钟可以搞定。

dim(raw_data) # 11,071 cells and 25,066 genes - already filtered
## [1] 25066 11071
data <- log2(1 + sweep(raw_data, 2, median(colSums(raw_data))/colSums(raw_data), '*')) # Normalization
cellTypes <- sapply(colnames(data), function(x) ExtractField(x, 2, '[.]'))

cellTypes <- ifelse(cellTypes == '1', 'PBMC', 'Tumor')
table(cellTypes)
## cellTypes
##  PBMC Tumor 
##  5748  5323

表达矩阵的质量控制

简单看看表达矩阵的性质,主要是基因数量,细胞数量;以及每个细胞表达基因的数量,和每个基因在多少个细胞里面表达。

# 可以看到, 2万多的基因里面,
# 绝大部分基因只在一万多细胞的至少200个是表达的
fivenum(apply(data,1,function(x) sum(x>0) ))
## AC114752.1      CNTD1       BEX5    FAM91A1     MT-CO1 
##          0         11        258       1806      11068
boxplot(apply(data,1,function(x) sum(x>0) ))

# 可以看到,一万多细胞里面
# 绝大部分细胞可以检测到2000个以上的基因。
fivenum(apply(data,2,function(x) sum(x>0) ))
## AGCTTGATCCTTGCCA.1 TCATTACGTACCTACA.1 ACGGCCACAAGAAGAG.1 
##              206.0             1478.5             2242.0 
## GCTCCTACAAGGTTTC.2 GGCAATTAGTGGTAAT.2 
##             3695.5             9278.0
hist(apply(data,2,function(x) sum(x>0) ))

然后创建Seurat的对象

start_time <- Sys.time()
# Create Seurat object
seurat <- CreateSeuratObject(raw.data = data, project = '10x_MCC_2') # already normalized
dim(data)
## [1] 25066 11071
seurat # 25,066 genes and 11,071 cells
## An object of class seurat in project 10x_MCC_2 
##  25066 genes across 11071 samples.
# 可以看到上面创建Seurat对象的那些参数并没有过滤基因或者细胞。


# Add meta.data (nUMI and cellTypes)
seurat <- AddMetaData(object = seurat, metadata = apply(raw_data, 2, sum), col.name = 'nUMI_raw')
seurat <- AddMetaData(object = seurat, metadata = cellTypes, col.name = 'cellTypes')

一些质控

这里绘图,可以指定分组,前提是这个分组变量存在于meta信息里面,我们创建对象后使用函数添加了 cellTypes 属性,所以可以用来进行可视化。

这里是:‘cellTypes’,就是PMBC和tumor的区别

sce=seurat
VlnPlot(object = sce, 
        features.plot = c("nGene", "nUMI"), 
        group.by = 'cellTypes', nCol = 2)

GenePlot(object = sce, gene1 = "nUMI", gene2 = "nGene")

可以看看高表达量基因是哪些

tail(sort(Matrix::rowSums(sce@raw.data)))
##   MT-CYB    RPL10   MT-CO3   MT-CO2   MT-CO1   EEF1A1 
## 62963.00 64713.21 69561.78 71444.01 73749.92 73919.11
## 散点图可视化任意两个基因的一些属性(通常是细胞的度量)
# 这里选取两个基因。
tmp=names(sort(Matrix::rowSums(sce@raw.data),decreasing = T))
GenePlot(object = sce, gene1 = tmp[1], gene2 = tmp[2])

# 散点图可视化任意两个细胞的一些属性(通常是基因的度量)
# 这里选取两个细胞
CellPlot(sce,sce@cell.names[3],sce@cell.names[4],do.ident = FALSE)

最后标准聚类可视化

start_time <- Sys.time()
# 最耗费时间的步骤在这里。
seurat <- ScaleData(object = seurat, vars.to.regress = c('nUMI_raw'), model.use = 'linear', use.umi = FALSE)
## NormalizeData has not been run, therefore ScaleData is running on non-normalized values. Recommended workflow is to run NormalizeData first.
## ScaleData is running on non-normalized values. Recommended workflow is to run NormalizeData first.
## 
## Time Elapsed:  1.00048394997915 mins
end_time <- Sys.time()
end_time - start_time
## Time difference of 1.588338 mins
seurat <- FindVariableGenes(object = seurat, 
                            mean.function = ExpMean,
                            dispersion.function = LogVMR, 
                            x.low.cutoff = 0.05, 
                            x.high.cutoff = 4, 
                            y.cutoff = 0.5)

head(seurat@var.genes)
## [1] "largeTAntigen" "FO538757.3"    "FO538757.2"    "HES4"         
## [5] "ISG15"         "TNFRSF18"
length(seurat@var.genes)
## [1] 2450
# 这里使用的pcs.compute 比前面两个要多。
seurat <- RunPCA(object = seurat, 
                 pc.genes = seurat@var.genes, 
                 pcs.compute = 40)
## [1] "PC1"
##  [1] "NNAT"          "DNAJB1"        "NEFM"          "HSPH1"        
##  [5] "SOX2"          "FSCN1"         "SPOCK2"        "DDIT4"        
##  [9] "NRN1"          "NEFH"          "RAB3A"         "FABP5"        
## [13] "CKS2"          "HSPA1B"        "ODC1"          "ZWINT"        
## [17] "HSPA1A"        "PLA2G12A"      "PDLIM1"        "CCK"          
## [21] "CD3D"          "IL7R"          "largeTAntigen" "RORA"         
## [25] "SLC25A39"      "NUSAP1"        "SMC4"          "CD69"         
## [29] "HSPB1"         "SYNE1"        
## [1] ""
##  [1] "AIF1"     "SPI1"     "TYMP"     "FCN1"     "BCL2A1"   "SERPINA1"
##  [7] "CTSS"     "ICAM1"    "S100A11"  "LST1"     "FCER1G"   "PLEK"    
## [13] "NLRP3"    "CD68"     "CXCL8"    "TNFAIP2"  "CFD"      "CD83"    
## [19] "PLAUR"    "NEAT1"    "LGALS1"   "NINJ1"    "CFP"      "S100A8"  
## [25] "CCL3L3"   "CLEC7A"   "LILRB2"   "NAMPT"    "IFNGR2"   "WARS"    
## [1] ""
## [1] ""
## [1] "PC2"
##  [1] "NNAT"     "H1F0"     "FSCN1"    "NEFM"     "FABP5"    "TXN"     
##  [7] "NRN1"     "SOX2"     "HSPH1"    "NEFH"     "MAFB"     "GPX1"    
## [13] "PDLIM1"   "HSPA1B"   "KLF4"     "PLA2G12A" "LMNA"     "PGRMC1"  
## [19] "ZWINT"    "RAB3A"    "TSC22D1"  "TPTEP1"   "CKS2"     "HSPA1A"  
## [25] "YWHAH"    "RHOB"     "CCK"      "ARC"      "SEPT5"    "CTTN"    
## [1] ""
##  [1] "CD7"      "ZAP70"    "CD247"    "TBC1D10C" "CTSW"     "CD69"    
##  [7] "TRBC1"    "CST7"     "GZMA"     "GZMM"     "BIN2"     "TSC22D3" 
## [13] "KLRB1"    "PRKCH"    "CLEC2B"   "TRAF3IP3" "SKAP1"    "IL2RB"   
## [19] "PRF1"     "MATK"     "HOPX"     "GNLY"     "GZMB"     "ARL4C"   
## [25] "P2RY8"    "CD3D"     "FGFBP2"   "WIPF1"    "ARHGAP15" "SPON2"   
## [1] ""
## [1] ""
## [1] "PC3"
##  [1] "TUBB1"      "GP9"        "C6orf25"    "GNG11"      "PF4"       
##  [6] "SDPR"       "CMTM5"      "RGS18"      "ACRBP"      "F13A1"     
## [11] "PTGS1"      "HRAT92"     "FO538757.3" "TMEM40"     "PPBP"      
## [16] "CLU"        "TREML1"     "PTCRA"      "DMTN"       "ESAM"      
## [21] "GRAP2"      "LINC00989"  "HIST1H2AC"  "TRIM58"     "CLEC1B"    
## [26] "C2orf88"    "PRKAR2B"    "ITGA2B"     "RUFY1"      "CLDN5"     
## [1] ""
##  [1] "MAFB"     "NNAT"     "IFNGR2"   "FSCN1"    "NAMPT"    "NEFM"    
##  [7] "NCF1"     "KLF4"     "TXN"      "HSPH1"    "FABP5"    "NRN1"    
## [13] "PLAUR"    "SOX2"     "S100A8"   "ACSL1"    "TNFAIP2"  "HSPA1B"  
## [19] "CXCL8"    "NEFH"     "CD83"     "ATP2B1"   "CCL3L3"   "S100A12" 
## [25] "SGK1"     "HSPA1A"   "BCL2A1"   "DMXL2"    "HLA-DQB1" "TMEM176A"
## [1] ""
## [1] ""
## [1] "PC4"
##  [1] "CD79A"     "MS4A1"     "IGHM"      "LINC00926" "FAM129C"  
##  [6] "BANK1"     "FOXP1"     "IL7R"      "CCR7"      "BLK"      
## [11] "IGHD"      "PIM2"      "FCER2"     "CD19"      "CXCR4"    
## [16] "MAL"       "AFF3"      "TCF7"      "TAGAP"     "LEF1"     
## [21] "MYC"       "BIRC3"     "CD22"      "SELL"      "TSC22D3"  
## [26] "TCL1A"     "CD3D"      "RHOH"      "CD69"      "ADAM28"   
## [1] ""
##  [1] "FGFBP2"   "GNLY"     "GZMB"     "CST7"     "PRF1"     "SPON2"   
##  [7] "KLRD1"    "GZMH"     "HOPX"     "KLRF1"    "S1PR5"    "GZMA"    
## [13] "CLIC3"    "ADGRG1"   "CTSW"     "PRSS23"   "TTC38"    "MATK"    
## [19] "IL2RB"    "IGFBP7"   "TRDC"     "SH2D1B"   "FCGR3A"   "FCRL6"   
## [25] "TBX21"    "APMAP"    "CMC1"     "C12orf75" "EFHD2"    "RHOC"    
## [1] ""
## [1] ""
## [1] "PC5"
##  [1] "IL7R"      "CD3D"      "TCF7"      "MAL"       "FLT3LG"   
##  [6] "LEF1"      "INPP4B"    "CD3G"      "CD6"       "TC2N"     
## [11] "CD5"       "CD2"       "RCAN3"     "AQP3"      "GIMAP7"   
## [16] "TRAC"      "SPOCK2"    "SERINC5"   "CD28"      "TRABD2A"  
## [21] "LIME1"     "PIK3IP1"   "LINC00861" "TRAT1"     "GATA3"    
## [26] "OXNAD1"    "CD27"      "S1PR1"     "LAT"       "TTC39C"   
## [1] ""
##  [1] "CD79A"     "MS4A1"     "IGHM"      "LINC00926" "FAM129C"  
##  [6] "BANK1"     "IGHD"      "CD19"      "BLK"       "FCER2"    
## [11] "ADAM28"    "CD22"      "TCL1A"     "AFF3"      "IGKC"     
## [16] "RALGPS2"   "FCRL1"     "ARHGAP24"  "VPREB3"    "CD79B"    
## [21] "MEF2C"     "HLA-DPB1"  "HLA-DQA1"  "FCRLA"     "KIAA0226L"
## [26] "HVCN1"     "BLNK"      "POU2AF1"   "TNFRSF13C" "SWAP70"   
## [1] ""
## [1] ""
seurat <- RunTSNE(object = seurat, dims.use = 1:10, perplexity = 50, do.fast = TRUE) 

start_time <- Sys.time()
## 避免太多log日志被打印出来。
seurat <- FindClusters(object = seurat, 
                       reduction.type = 'pca', 
                       dims.use = 1:10, 
                       resolution = 0.6, 
                       print.output = 0,
                      k.param = 20, 
                      save.SNN = TRUE)


TSNEPlot(seurat, group.by = 'cellTypes')

TSNEPlot(seurat,group.by = "ident",pt.shape ='cellTypes') 

end_time <- Sys.time()
end_time - start_time
## Time difference of 10.80697 secs

输出seurat结果后面使用

start_time <- Sys.time()
save(seurat,file = 'patient2.seurat.output.Rdata')
end_time <- Sys.time()
end_time - start_time
## Time difference of 1.98285 mins
# 这个步骤会输出文件  

实际上最后的图也需要标记细胞类群,文章如下:

作者使用的marker基因列表也可以在文章附件找到,如下:

显示运行环境

sessionInfo()
## R version 3.5.1 (2018-07-02)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS  10.14
## 
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] Seurat_2.3.4  Matrix_1.2-14 cowplot_0.9.3 ggplot2_3.2.0
## 
## loaded via a namespace (and not attached):
##   [1] Rtsne_0.15          colorspace_1.4-0    class_7.3-14       
##   [4] modeltools_0.2-22   ggridges_0.5.1      mclust_5.4.1       
##   [7] rprojroot_1.3-2     htmlTable_1.13.1    base64enc_0.1-3    
##  [10] rstudioapi_0.9.0    proxy_0.4-22        npsurv_0.4-0       
##  [13] flexmix_2.3-14      bit64_0.9-7         codetools_0.2-15   
##  [16] splines_3.5.1       R.methodsS3_1.7.1   lsei_1.2-0         
##  [19] robustbase_0.93-3   knitr_1.21          jsonlite_1.6       
##  [22] Formula_1.2-3       ica_1.0-2           cluster_2.0.7-1    
##  [25] kernlab_0.9-27      png_0.1-7           R.oo_1.22.0        
##  [28] compiler_3.5.1      httr_1.3.1          backports_1.1.3    
##  [31] assertthat_0.2.0    lazyeval_0.2.1      lars_1.2           
##  [34] acepack_1.4.1       htmltools_0.3.6     tools_3.5.1        
##  [37] bindrcpp_0.2.2      igraph_1.2.2        gtable_0.2.0       
##  [40] glue_1.3.0          RANN_2.6            reshape2_1.4.3     
##  [43] dplyr_0.7.8         Rcpp_1.0.0          gdata_2.18.0       
##  [46] ape_5.2             nlme_3.1-137        iterators_1.0.10   
##  [49] fpc_2.2-3           gbRd_0.4-11         lmtest_0.9-36      
##  [52] xfun_0.4            stringr_1.3.1       irlba_2.3.2        
##  [55] gtools_3.8.1        DEoptimR_1.0-8      MASS_7.3-50        
##  [58] zoo_1.8-4           scales_1.0.0        doSNOW_1.0.16      
##  [61] parallel_3.5.1      RColorBrewer_1.1-2  yaml_2.2.0         
##  [64] reticulate_1.10     pbapply_1.3-4       gridExtra_2.3      
##  [67] rpart_4.1-13        segmented_0.5-3.0   latticeExtra_0.6-28
##  [70] stringi_1.2.4       foreach_1.4.4       checkmate_1.9.1    
##  [73] caTools_1.17.1.1    bibtex_0.4.2        Rdpack_0.10-1      
##  [76] SDMTools_1.1-221    rlang_0.3.1         pkgconfig_2.0.2    
##  [79] dtw_1.20-1          prabclus_2.2-6      bitops_1.0-6       
##  [82] evaluate_0.12       lattice_0.20-35     ROCR_1.0-7         
##  [85] purrr_0.3.0         bindr_0.1.1         labeling_0.3       
##  [88] htmlwidgets_1.3     bit_1.1-14          tidyselect_0.2.5   
##  [91] plyr_1.8.4          magrittr_1.5        R6_2.3.0           
##  [94] snow_0.4-3          gplots_3.0.1.1      Hmisc_4.2-0        
##  [97] pillar_1.3.1        foreign_0.8-70      withr_2.1.2        
## [100] fitdistrplus_1.0-11 mixtools_1.1.0      survival_2.42-3    
## [103] nnet_7.3-12         tsne_0.1-3          tibble_2.0.1       
## [106] crayon_1.3.4        hdf5r_1.0.1         KernSmooth_2.23-15 
## [109] rmarkdown_1.10      grid_3.5.1          data.table_1.12.0  
## [112] metap_1.0           digest_0.6.18       diptest_0.75-7     
## [115] tidyr_0.8.1         R.utils_2.7.0       stats4_3.5.1       
## [118] munsell_0.5.0